import os, sys
import numpy as np
import pandas as pd
import pandas_profiling
import codecs
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV #####Revisar esta librería.
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from IPython.display import Image
from sklearn.tree import export_graphviz
from joblib import dump, load
# Load the train/test spreadsheets and the auxiliary CSV files that are
# joined onto them further down (Google result counts and spell-checker
# output, judging by the file names).
dataTrain = pd.read_excel("train.xlsx")
dataTest = pd.read_excel("test.xlsx")
googleResultsTrain = pd.read_csv("resultados_google_train.csv")
googleResultsTest = pd.read_csv("resultados_google_test.csv")
spanishCorrector_Train = pd.read_csv("SpanishCorrector_Train.csv")
spanishCorrector_Test = pd.read_csv("SpanishCorrector_Test.csv")

# Word lists, one entry per line. ``splitlines()`` is used instead of
# ``split('\r\n')`` so the lists load correctly whatever the line-ending
# convention of the file is; with LF-only files the old split produced a
# single giant entry and no word ever matched. Unlike the old split it does
# not leave a trailing empty string when the file ends with a newline.
with codecs.open('positive_words_es.txt', 'r', encoding='utf8') as f:
    positive_words = f.read().splitlines()
with codecs.open('negative_words_es.txt', 'r', encoding='utf8') as f:
    negative_words = f.read().splitlines()

# Notebook-style preview; has no visible effect when run as a plain script.
dataTrain.head()
# % Mayusculas
def mayusculas(str):
    """Return the fraction of characters in ``str`` that are uppercase.

    Args:
        str: Text to inspect. The parameter name shadows the builtin but is
            kept so existing callers are unaffected.

    Returns:
        Number of uppercase characters divided by the total number of
        characters, as a float. An empty input yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    total = len(str)
    if total == 0:
        return 0.0
    return sum(1 for c in str if c.isupper()) / total
# total de signos de interrogación
def numInterrogacionTot(str):
    """Count the question marks in ``str``.

    Both the opening (``¿``) and closing (``?``) marks are counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Total number of question-mark characters as an int.
    """
    return sum(1 for c in str if c in ("¿", "?"))
# porcentaje signos de interrogacion
def numInterrogacionRel(str):
    """Return the share of characters in ``str`` that are question marks.

    Both the opening (``¿``) and closing (``?``) marks are counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Question-mark count divided by the total character count, as a
        float. An empty input yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(str)
    if total == 0:
        return 0.0
    return sum(1 for c in str if c in ("¿", "?")) / total
# total de signos de exclamacion
def numExclamacionTot(str):
    """Count the exclamation marks in ``str``.

    Both the opening (``¡``) and closing (``!``) marks are counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Total number of exclamation-mark characters as an int.
    """
    return sum(1 for c in str if c in ("¡", "!"))
# porcentaje signos de exclamacion
def numExclamacionRel(str):
    """Return the share of characters in ``str`` that are exclamation marks.

    Both the opening (``¡``) and closing (``!``) marks are counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Exclamation-mark count divided by the total character count, as a
        float. An empty input yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(str)
    if total == 0:
        return 0.0
    return sum(1 for c in str if c in ("¡", "!")) / total
# palabras positivas
def positiveTot(str, lexicon=None):
    """Count the tokens of ``str`` that appear in the positive word list.

    The text is split on whitespace and every token is compared verbatim,
    so matching is sensitive to case and to attached punctuation
    ("Bueno," does not match "bueno"). Repeated tokens count each time.

    Args:
        str: Text to analyse (name kept for backward compatibility).
        lexicon: Optional iterable of words to match against. When omitted,
            the module-level ``positive_words`` list is used.

    Returns:
        Number of matching tokens as an int.
    """
    # A set gives O(1) membership tests instead of scanning the list once
    # per token.
    vocabulary = frozenset(positive_words if lexicon is None else lexicon)
    return sum(1 for word in str.split() if word in vocabulary)
def positiveRel(str, lexicon=None):
    """Return the share of tokens in ``str`` found in the positive word list.

    The text is split on whitespace and every token is compared verbatim,
    so matching is sensitive to case and to attached punctuation.

    Args:
        str: Text to analyse (name kept for backward compatibility).
        lexicon: Optional iterable of words to match against. When omitted,
            the module-level ``positive_words`` list is used.

    Returns:
        Matching tokens divided by total tokens, as a float. Text with no
        tokens (empty or whitespace only) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    words = str.split()
    if not words:
        return 0.0
    # A set gives O(1) membership tests instead of scanning the list once
    # per token.
    vocabulary = frozenset(positive_words if lexicon is None else lexicon)
    return sum(1 for word in words if word in vocabulary) / len(words)
# palabras negativas
def negativeTot(str, lexicon=None):
    """Count the tokens of ``str`` that appear in the negative word list.

    The text is split on whitespace and every token is compared verbatim,
    so matching is sensitive to case and to attached punctuation
    ("Malo," does not match "malo"). Repeated tokens count each time.

    Args:
        str: Text to analyse (name kept for backward compatibility).
        lexicon: Optional iterable of words to match against. When omitted,
            the module-level ``negative_words`` list is used.

    Returns:
        Number of matching tokens as an int.
    """
    # A set gives O(1) membership tests instead of scanning the list once
    # per token.
    vocabulary = frozenset(negative_words if lexicon is None else lexicon)
    return sum(1 for word in str.split() if word in vocabulary)
def negativeRel(str, lexicon=None):
    """Return the share of tokens in ``str`` found in the negative word list.

    The text is split on whitespace and every token is compared verbatim,
    so matching is sensitive to case and to attached punctuation.

    Args:
        str: Text to analyse (name kept for backward compatibility).
        lexicon: Optional iterable of words to match against. When omitted,
            the module-level ``negative_words`` list is used.

    Returns:
        Matching tokens divided by total tokens, as a float. Text with no
        tokens (empty or whitespace only) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    words = str.split()
    if not words:
        return 0.0
    # A set gives O(1) membership tests instead of scanning the list once
    # per token.
    vocabulary = frozenset(negative_words if lexicon is None else lexicon)
    return sum(1 for word in words if word in vocabulary) / len(words)
# palabras unicas / palabras totales
def redundancia(str):
    """Return the ratio of distinct tokens to total tokens in ``str``.

    Tokens are the whitespace-separated pieces of the text, compared
    verbatim (case and punctuation are significant).

    Args:
        str: Text to analyse (name kept for backward compatibility).

    Returns:
        ``unique tokens / total tokens`` as a float. Text with no tokens
        (empty or whitespace only) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    words = str.split()
    if not words:
        return 0.0
    # A set removes duplicates in one pass, replacing the quadratic
    # "append if not already in the list" loop.
    return len(set(words)) / len(words)
#Contar *NUMBER*
def num(str):
    """Count the ``*NUMBER*`` placeholder tokens in ``str``.

    The text is split on whitespace and only tokens that are exactly
    ``*NUMBER*`` are counted; a placeholder glued to punctuation (for
    example ``*NUMBER*,``) is not.

    Args:
        str: Text to analyse (name kept for backward compatibility).

    Returns:
        Number of ``*NUMBER*`` tokens as an int (0 when there are none).
    """
    # Equivalent to the former dedup-then-count logic, in a single pass.
    return str.split().count("*NUMBER*")
def numRel(str):
    """Return the share of tokens in ``str`` that are ``*NUMBER*``.

    The text is split on whitespace and only tokens that are exactly
    ``*NUMBER*`` are counted.

    Args:
        str: Text to analyse (name kept for backward compatibility).

    Returns:
        ``*NUMBER*`` tokens divided by total tokens, as a float. Text with
        no tokens (empty or whitespace only) yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    words = str.split()
    if not words:
        return 0.0
    # Equivalent to the former dedup-then-count logic, in a single pass.
    return words.count("*NUMBER*") / len(words)
def numResults(str):
    """Parse a search-result summary string into an integer count.

    The literal ``"No results"`` maps to 0. Otherwise the phrase
    ``"Cerca de"`` is removed, the first whitespace-separated token is
    taken, its commas are stripped and the remainder is converted with
    ``int``.

    Args:
        str: Summary text to parse (name kept for backward compatibility).

    Returns:
        The parsed count as an int.

    Raises:
        ValueError: If no token is left to parse, or if the first token is
            not an integer once commas are removed.
    """
    # Tolerate stray surrounding whitespace around the sentinel value.
    if str.strip() == "No results":
        return 0
    tokens = str.replace("Cerca de", "").split()
    if not tokens:
        # Previously surfaced as an opaque IndexError.
        raise ValueError("no result count found in %r" % (str,))
    # NOTE(review): only "," is handled as a thousands separator; counts
    # written with "." (e.g. "1.234.000") would fail in int() - confirm the
    # format of the scraped data.
    return int(tokens[0].replace(",", ""))
# total de comillas
def numComillasTot(str):
    """Count the quotation-mark characters in ``str``.

    Angle quotes, curly double and single quotes, and the straight single
    and double quotes are all counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Total number of quotation-mark characters as an int.
    """
    quote_chars = ("«", "»", "“", "”", "‘", "’", "'", "\"")
    return sum(1 for c in str if c in quote_chars)
# porcentaje comillas
def numComillasRel(str):
    """Return the share of characters in ``str`` that are quotation marks.

    Angle quotes, curly double and single quotes, and the straight single
    and double quotes are all counted.

    Args:
        str: Text to inspect (name kept for backward compatibility).

    Returns:
        Quotation-mark count divided by the total character count, as a
        float. An empty input yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(str)
    if total == 0:
        return 0.0
    quote_chars = ("«", "»", "“", "”", "‘", "’", "'", "\"")
    return sum(1 for c in str if c in quote_chars) / total
# (column name, extractor) pairs computed from the 'Text' column. The order
# is the order in which the columns are appended to the data frame, so it
# must stay stable for any code that selects features by position.
_TEXT_FEATURES = (
    ('#SignosInterrogación', numInterrogacionTot),
    ('%SignosInterrogación', numInterrogacionRel),
    ('#SignosExclamación', numExclamacionTot),
    ('%SignosExclamación', numExclamacionRel),
    ('#PalabrasPositivas', positiveTot),
    ('%PalabrasPositivas', positiveRel),
    ('#PalabrasNegativas', negativeTot),
    ('%PalabrasNegativas', negativeRel),
    ('Palabras unicas/palabras totales', redundancia),
    ('#Numeros', num),
    ('%Numeros', numRel),
    ('#Comillas', numComillasTot),
    ('%Comillas', numComillasRel),
)


def _add_features(data, google_results, corrector):
    """Add the engineered feature columns to ``data`` in place.

    The same steps are applied to the train and the test frame so both end
    up with identical columns in identical order.

    Args:
        data: Frame with 'Category', 'Headline' and 'Text' columns; it is
            modified in place.
        google_results: Frame with 'GSearch' and 'GSearchNews' columns.
        corrector: Frame with 'Inconsistency', 'Grammar', 'Typographical'
            and 'Spacing' columns.

    Returns:
        None.
    """
    # Binary target: 1 for the string "True", 0 for anything else.
    # NOTE(review): if read_excel parses this column as booleans instead of
    # strings, every row maps to 0 - confirm the dtype of 'Category'.
    data['Category'] = data['Category'].apply(lambda x: 1 if x == "True" else 0)
    data['%MayusculasHeadLine'] = data['Headline'].apply(mayusculas)
    for column, extractor in _TEXT_FEATURES:
        data[column] = data['Text'].apply(extractor)
    # NOTE(review): the assignments below align on the index, so the
    # auxiliary frames are assumed to share the row index of ``data`` -
    # confirm.
    data['#ResultadosGoogle'] = google_results['GSearch'].apply(numResults)
    data['#ResultadosGoogleNews'] = google_results['GSearchNews'].apply(numResults)
    # Flag rows whose news search returned nothing.
    data['0ResultadosGoogleNews'] = data['#ResultadosGoogleNews'].apply(
        lambda x: 1 if x == 0 else 0
    )
    data['#Mistakes'] = (
        corrector['Inconsistency']
        + corrector['Grammar']
        + corrector['Typographical']
        + corrector['Spacing']
    )


_add_features(dataTrain, googleResultsTrain, spanishCorrector_Train)
_add_features(dataTest, googleResultsTest, spanishCorrector_Test)

# Notebook-style inspections; they have no visible effect when run as a
# plain script.
dataTrain.loc[3, :]
pandas_profiling.ProfileReport(dataTrain)